Interpretable Classification for Hack4Rare

Email: sheng.xu@aa.com

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
import warnings
warnings.filterwarnings("ignore")
In [2]:
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
set_visualize_provider(InlineProvider())
In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show
In [4]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#from xgboost import XGBClassifier

from sklearn.linear_model import SGDClassifier, LogisticRegression

from interpret.glassbox import ExplainableBoostingClassifier
In [6]:
# Zipped CSV holding the input table; pandas infers the compression from the
# file extension.
counts_csv_path = 'pbta-gene-counts-rsem-expected_count-collapsed.combined.filtered.target.highlow.500.T.zip'
df = pd.read_csv(counts_csv_path)
In [7]:
# Sanity check: (rows, columns) of the freshly loaded table.
df.shape
Out[7]:
(415, 482)
In [8]:
# Inspect the column labels; the first column ('Unnamed: 0') and the last two
# ('target', 'HighLowGrade') are not features and are excluded further down.
df.columns
Out[8]:
Index(['Unnamed: 0', 'MT-CO1', 'GFAP', 'MT-ND4', 'MT-CO3', 'EEF1A1', 'MT-CYB',
       'MBP', 'ACTB', 'SPARC',
       ...
       'ACSM2B', 'OR13C2', 'AC142391.1', 'SMIM28', 'AC018554.3', 'KLK8',
       'OR6N1', 'FGF6', 'target', 'HighLowGrade'],
      dtype='object', length=482)
In [9]:
# Preview the first five rows of the raw table.
df.head()
Out[9]:
Unnamed: 0 MT-CO1 GFAP MT-ND4 MT-CO3 EEF1A1 MT-CYB MBP ACTB SPARC ... ACSM2B OR13C2 AC142391.1 SMIM28 AC018554.3 KLK8 OR6N1 FGF6 target HighLowGrade
0 BS_0VXZCRJS 688399.97 768375.44 489755.90 240854.75 484337.54 213623.0 3068.00 286578.16 462590.00 ... 1.0 0.0 0.00 0.0 0.0 0.0 0.0 0.0 1.0 1.0
1 BS_3AC3SRWH 6885942.15 695792.99 3068246.64 2359968.34 145654.84 1838574.0 3673422.60 19284.02 4628.00 ... 0.0 0.0 2.00 0.0 0.0 0.0 1.0 0.0 1.0 1.0
2 BS_4PPHAQXF 1042630.21 249792.48 651345.95 390247.42 628701.58 253190.0 1445.64 230195.17 266144.00 ... 0.0 0.0 0.00 0.0 1.0 0.0 0.0 0.0 0.0 1.0
3 BS_4PWDGEB0 914337.23 17581.45 588803.00 349555.99 305689.05 269897.0 809.00 217459.08 252539.99 ... 0.0 2.0 4.97 0.0 0.0 0.0 0.0 0.0 1.0 1.0
4 BS_58YXHGAJ 617360.80 477984.96 398712.00 248921.83 483424.31 189343.0 105316.27 171486.03 24650.99 ... 0.0 0.0 0.00 0.0 0.0 1.0 0.0 0.0 0.0 1.0

5 rows × 482 columns

In [10]:
# Feature columns: everything between the leading sample-ID column
# ('Unnamed: 0', position 0) and the two trailing label columns
# ('target', 'HighLowGrade').
train_cols = df.columns[1:-2]

# The classification target is the last column ('HighLowGrade').
label = df.columns[-1]

# Guard against label leakage: the target must never be among the features.
assert label not in train_cols, "label column leaked into the feature set"

y = df[label]
In [11]:
# Single seed shared by the split below and by the model fitted later on.
seed = 1

# Hold out 20% of the rows. The whole DataFrame is split (not just the feature
# matrix) so that each partition keeps its ID and label columns.
# NOTE(review): no `stratify=` is passed, so the class ratio of `y` is not
# guaranteed to be preserved in the two partitions - confirm this is intended.
df_train, df_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.20,
    random_state=seed,
)
In [12]:
# Persist the held-out partition. This is the full-width frame (ID column,
# features and both label columns), and the row index is written as well
# because `index=False` is not passed.
df_test.to_csv("df_test.csv")
In [13]:
# (rows, columns) of the training partition.
df_train.shape
Out[13]:
(332, 482)
In [14]:
# (rows, columns) of the held-out partition.
df_test.shape
Out[14]:
(83, 482)
In [15]:
# Preview the training partition; the index shows the shuffled original row labels.
df_train.head()
Out[15]:
Unnamed: 0 MT-CO1 GFAP MT-ND4 MT-CO3 EEF1A1 MT-CYB MBP ACTB SPARC ... ACSM2B OR13C2 AC142391.1 SMIM28 AC018554.3 KLK8 OR6N1 FGF6 target HighLowGrade
159 BS_D29RPBSZ 452321.54 216161.0 199327.42 118267.33 54390.97 126867.0 871.00 86782.57 20818.00 ... 1.0 0.5 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0
95 BS_5CTVXVRX 181744.03 91.0 17946.99 135002.63 1025556.42 16899.0 376.62 166512.70 26949.00 ... 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0
11 BS_PXCPK5XS 452627.50 17821.9 189167.00 181179.84 426570.13 140298.0 587.40 235700.00 213354.99 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0
374 BS_YZ2Z1Q6Y 721220.89 377771.0 410082.00 205727.94 84671.03 219170.0 18584.00 46338.61 120781.00 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
165 BS_C7A2TYAC 524802.48 968914.0 219724.00 179642.96 237301.95 123222.0 43642.00 318150.53 110694.00 ... 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 482 columns

In [16]:
# Preview the held-out partition.
df_test.head()
Out[16]:
Unnamed: 0 MT-CO1 GFAP MT-ND4 MT-CO3 EEF1A1 MT-CYB MBP ACTB SPARC ... ACSM2B OR13C2 AC142391.1 SMIM28 AC018554.3 KLK8 OR6N1 FGF6 target HighLowGrade
388 BS_862NMAR7 757880.34 1431919.0 465653.84 304550.91 379905.19 228292.0 44760.00 386329.89 246661.0 ... 0.0 6.0 0.0 1.0 0.0 0.0 1.0 3.0 1.0 1.0
102 BS_66HQ3E4Z 466915.21 1208314.0 217188.92 155404.86 228500.94 117132.0 38553.00 305845.75 217380.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
187 BS_EFEZB0ZH 322791.13 1740657.0 147939.87 121399.91 213259.97 91998.0 7386.00 288555.92 496903.0 ... 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0
162 BS_C51RB0YR 1150642.36 222780.0 499514.99 396129.86 296533.34 363720.0 2165.00 410370.93 702442.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
90 BS_52426AMF 656078.15 1845002.0 356146.08 203549.33 251682.86 169410.0 45588.73 286209.12 273309.0 ... 0.0 3.0 0.0 0.0 0.0 10.0 0.0 0.0 0.0 0.0

5 rows × 482 columns

In [17]:
# Reduce both partitions to the feature columns only, dropping the sample-ID
# and label columns that were carried through the split.
X_train, X_test = [part.loc[:, train_cols] for part in (df_train, df_test)]
In [18]:
# Confirm the training feature matrix no longer contains the ID or label columns.
X_train.head()
Out[18]:
MT-CO1 GFAP MT-ND4 MT-CO3 EEF1A1 MT-CYB MBP ACTB SPARC VIM ... Z84492.1 IFNW1 ACSM2B OR13C2 AC142391.1 SMIM28 AC018554.3 KLK8 OR6N1 FGF6
159 452321.54 216161.0 199327.42 118267.33 54390.97 126867.0 871.00 86782.57 20818.00 105224.00 ... 8.71 1.0 1.0 0.5 0.0 0.0 1.0 0.0 0.0 0.0
95 181744.03 91.0 17946.99 135002.63 1025556.42 16899.0 376.62 166512.70 26949.00 66706.00 ... 0.00 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0
11 452627.50 17821.9 189167.00 181179.84 426570.13 140298.0 587.40 235700.00 213354.99 151174.56 ... 0.00 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
374 721220.89 377771.0 410082.00 205727.94 84671.03 219170.0 18584.00 46338.61 120781.00 15499.00 ... 0.55 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
165 524802.48 968914.0 219724.00 179642.96 237301.95 123222.0 43642.00 318150.53 110694.00 142524.00 ... 0.00 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 479 columns

In [19]:
# Same check for the held-out feature matrix.
X_test.head()
Out[19]:
MT-CO1 GFAP MT-ND4 MT-CO3 EEF1A1 MT-CYB MBP ACTB SPARC VIM ... Z84492.1 IFNW1 ACSM2B OR13C2 AC142391.1 SMIM28 AC018554.3 KLK8 OR6N1 FGF6
388 757880.34 1431919.0 465653.84 304550.91 379905.19 228292.0 44760.00 386329.89 246661.0 89902.0 ... 8.35 0.0 0.0 6.0 0.0 1.0 0.0 0.0 1.0 3.0
102 466915.21 1208314.0 217188.92 155404.86 228500.94 117132.0 38553.00 305845.75 217380.0 72290.0 ... 0.00 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
187 322791.13 1740657.0 147939.87 121399.91 213259.97 91998.0 7386.00 288555.92 496903.0 108977.0 ... 0.00 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0
162 1150642.36 222780.0 499514.99 396129.86 296533.34 363720.0 2165.00 410370.93 702442.0 296294.0 ... 0.00 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
90 656078.15 1845002.0 356146.08 203549.33 251682.86 169410.0 45588.73 286209.12 273309.0 156027.0 ... 0.00 0.0 0.0 3.0 0.0 0.0 0.0 10.0 0.0 0.0

5 rows × 479 columns

In [20]:
# Display the label Series for the full (unsplit) table.
y
Out[20]:
0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
410    1.0
411    1.0
412    1.0
413    1.0
414    1.0
Name: HighLowGrade, Length: 415, dtype: float64
In [21]:
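# Superseded, kept for reference only: the split is performed on the full
# DataFrame in an earlier cell. `X` is never defined in this notebook, so
# these lines would fail if uncommented as-is.
#seed = 1
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)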
In [22]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# NOTE(review): this rebinds the name `LogisticRegression`, replacing the
# `sklearn.linear_model.LogisticRegression` imported in an earlier cell. From
# here on the name refers to interpret's glassbox wrapper.
from interpret.glassbox import LogisticRegression
from interpret import show
In [23]:
# Train an Explainable Boosting Machine on the training features and labels,
# reusing `seed` so the fit is seeded with the same value as the split.
ebm = ExplainableBoostingClassifier(random_state=seed)
# Left as the cell's last expression so the notebook echoes the fitted estimator.
ebm.fit(X_train, y_train)
Out[23]:
ExplainableBoostingClassifier(feature_names=['MT-CO1', 'GFAP', 'MT-ND4',
                                             'MT-CO3', 'EEF1A1', 'MT-CYB',
                                             'MBP', 'ACTB', 'SPARC', 'VIM',
                                             'PLP1', 'TUBA1A', 'MT-ND3', 'FTH1',
                                             'APOE', 'AQP4', 'BCAN', 'FTL',
                                             'SPP1', 'RPL3', 'TUBA1B', 'SCD',
                                             'EEF1G', 'APP', 'HNRNPA1', 'IGF2',
                                             'CST3', 'RPS6', 'HSP90AA1',
                                             'CHI3L1', ...],
                              feature_types=['continuous', 'continuous',
                                             'continuous', 'continuous',
                                             '...ous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous', ...],
                              random_state=1)
In [24]:
# Build the model-level (global) explanation and render it through the
# visualization provider configured at the top of the notebook.
ebm_global = ebm.explain_global()
show(ebm_global)
In [25]:
# Number of training rows available for the local explanations below.
X_train.shape
Out[25]:
(332, 479)
In [26]:
# Per-sample (local) explanations on the training split.
# Upper bound on how many rows to explain. It exceeds the 332 training rows
# reported above, so in practice every training row is included.
max_train_rows_to_explain = 642

# `.iloc` makes the slice explicitly positional: after the shuffled split the
# index holds non-contiguous integer labels, so a bare `[:n]` reads ambiguously.
ebm_local2 = ebm.explain_local(
    X_train.iloc[:max_train_rows_to_explain],
    y_train.iloc[:max_train_rows_to_explain],
)
show(ebm_local2)
In [27]:
# Number of held-out rows available for the local explanations below.
X_test.shape
Out[27]:
(83, 479)
In [28]:
# Per-sample (local) explanations on the held-out split.
# Upper bound on how many rows to explain. It exceeds the 83 held-out rows
# reported above, so in practice every held-out row is included.
max_test_rows_to_explain = 161

# `.iloc` makes the slice explicitly positional: after the shuffled split the
# index holds non-contiguous integer labels, so a bare `[:n]` reads ambiguously.
ebm_local = ebm.explain_local(
    X_test.iloc[:max_test_rows_to_explain],
    y_test.iloc[:max_test_rows_to_explain],
)
show(ebm_local)
In [29]:
from interpret.perf import ROC

# Wrap the model's probability function in a ROC explainer, then evaluate it
# on the held-out features and labels.
roc_explainer = ROC(ebm.predict_proba)
ebm_perf = roc_explainer.explain_perf(X_test, y_test, name='EBM')

# Render the ROC performance explanation.
show(ebm_perf)